From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/vpt: fix race when migrating timers between vCPUs

The current vPT code will migrate the emulated timers between vCPUs
(change the pt->vcpu field) while just holding the destination lock,
either from create_periodic_time or pt_adjust_global_vcpu_target if
the global target is adjusted. Changing the periodic_timer vCPU field
in this way creates a race where a third party could grab the lock in
the unlocked region of pt_adjust_global_vcpu_target (or before
create_periodic_time performs the vcpu change) and then release the
lock from a different vCPU, creating a locking imbalance.

Introduce a per-domain rwlock in order to protect periodic_time
migration between vCPU lists. Taking the lock in read mode prevents
any timer from being migrated to a different vCPU, while taking it in
write mode allows performing migration of timers across vCPUs. The
per-vcpu locks are still used to protect all the other fields from the
periodic_timer struct.

Note that such migration shouldn't happen frequently, and hence
there's no performance drop as a result of such locking.

This is XSA-336.

Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Tested-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
Changes since v2:
 - Re-order pt_adjust_vcpu to remove one if.
 - Fix pt_lock to not call pt_vcpu_lock, as we might end up using a
   stale value of pt->vcpu when taking the per-vcpu lock.

Changes since v1:
 - Use a per-domain rwlock to protect timer vCPU migration.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain
     /* need link to containing domain */
     d->arch.hvm.pl_time->domain = d;
 
+    rwlock_init(&d->arch.hvm.pl_time->pt_migrate);
+
     /* Set the default IO Bitmap. */
     if ( is_hardware_domain(d) )
     {
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic
     return 1;
 }
 
-static void pt_lock(struct periodic_time *pt)
+static void pt_vcpu_lock(struct vcpu *v)
 {
-    struct vcpu *v;
+    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
+    spin_lock(&v->arch.hvm.tm_lock);
+}
 
-    for ( ; ; )
-    {
-        v = pt->vcpu;
-        spin_lock(&v->arch.hvm.tm_lock);
-        if ( likely(pt->vcpu == v) )
-            break;
-        spin_unlock(&v->arch.hvm.tm_lock);
-    }
+static void pt_vcpu_unlock(struct vcpu *v)
+{
+    spin_unlock(&v->arch.hvm.tm_lock);
+    read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
+}
+
+static void pt_lock(struct periodic_time *pt)
+{
+    /*
+     * We cannot use pt_vcpu_lock here, because we need to acquire the
+     * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
+     * else we might be using a stale value of pt->vcpu.
+     */
+    read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+    spin_lock(&pt->vcpu->arch.hvm.tm_lock);
 }
 
 static void pt_unlock(struct periodic_time *pt)
 {
-    spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+    pt_vcpu_unlock(pt->vcpu);
 }
 
 static void pt_process_missed_ticks(struct periodic_time *pt)
@@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v)
     if ( v->pause_flags & VPF_blocked )
         return;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
         if ( !pt->do_not_freeze )
@@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v)
 
     pt_freeze_time(v);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 void pt_restore_timer(struct vcpu *v)
@@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v)
     struct list_head *head = &v->arch.hvm.tm_list;
     struct periodic_time *pt;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
     {
@@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v)
 
     pt_thaw_time(v);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 static void pt_timer_fn(void *data)
@@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v)
     int irq, pt_vector = -1;
     bool level;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     earliest_pt = NULL;
     max_lag = -1ULL;
@@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v)
 
     if ( earliest_pt == NULL )
     {
-        spin_unlock(&v->arch.hvm.tm_lock);
+        pt_vcpu_unlock(v);
         return -1;
     }
 
@@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v)
     irq = earliest_pt->irq;
     level = earliest_pt->level;
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 
     switch ( earliest_pt->source )
     {
@@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v)
                 time_cb *cb = NULL;
                 void *cb_priv;
 
-                spin_lock(&v->arch.hvm.tm_lock);
+                pt_vcpu_lock(v);
                 /* Make sure the timer is still on the list. */
                 list_for_each_entry ( pt, &v->arch.hvm.tm_list, list )
                     if ( pt == earliest_pt )
@@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v)
                         cb_priv = pt->priv;
                         break;
                     }
-                spin_unlock(&v->arch.hvm.tm_lock);
+                pt_vcpu_unlock(v);
 
                 if ( cb != NULL )
                     cb(v, cb_priv);
@@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct
     if ( intack.source == hvm_intsrc_vector )
         return;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     pt = is_pt_irq(v, intack);
     if ( pt == NULL )
     {
-        spin_unlock(&v->arch.hvm.tm_lock);
+        pt_vcpu_unlock(v);
         return;
     }
 
@@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct
     cb = pt->cb;
     cb_priv = pt->priv;
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 
     if ( cb != NULL )
         cb(v, cb_priv);
@@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v)
     struct list_head *head = &v->arch.hvm.tm_list;
     struct periodic_time *pt;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
         migrate_timer(&pt->timer, v->processor);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 void create_periodic_time(
@@ -490,7 +499,7 @@ void create_periodic_time(
 
     destroy_periodic_time(pt);
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    write_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
 
     pt->pending_intr_nr = 0;
     pt->do_not_freeze = 0;
@@ -540,7 +549,7 @@ void create_periodic_time(
     init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
     set_timer(&pt->timer, pt->scheduled);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
 }
 
 void destroy_periodic_time(struct periodic_time *pt)
@@ -565,30 +574,20 @@ void destroy_periodic_time(struct period
 
 static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
 {
-    int on_list;
-
     ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
 
     if ( pt->vcpu == NULL )
         return;
 
-    pt_lock(pt);
-    on_list = pt->on_list;
-    if ( pt->on_list )
-        list_del(&pt->list);
-    pt->on_list = 0;
-    pt_unlock(pt);
-
-    spin_lock(&v->arch.hvm.tm_lock);
+    write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
     pt->vcpu = v;
-    if ( on_list )
+    if ( pt->on_list )
     {
-        pt->on_list = 1;
+        list_del(&pt->list);
         list_add(&pt->list, &v->arch.hvm.tm_list);
-
         migrate_timer(&pt->timer, v->processor);
     }
-    spin_unlock(&v->arch.hvm.tm_lock);
+    write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
 }
 
 void pt_adjust_global_vcpu_target(struct vcpu *v)
--- a/xen/include/asm-x86/hvm/vpt.h
+++ b/xen/include/asm-x86/hvm/vpt.h
@@ -128,6 +128,13 @@ struct pl_time {    /* platform time */
     struct RTCState  vrtc;
     struct HPETState vhpet;
     struct PMTState  vpmt;
+    /*
+     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
+     * mode in order to prevent the vcpu field of periodic_time from changing.
+     * Lock must be taken in write mode when changes to the vcpu field are
+     * performed, as it allows exclusive access to all the timers of a domain.
+     */
+    rwlock_t pt_migrate;
     /* guest_time = Xen sys time + stime_offset */
     int64_t stime_offset;
     /* Ensures monotonicity in appropriate timer modes. */
